The objective of this project is to develop a binary classification model that helps predict whether an European retail bank customer is going to churn or not. I conducted the project as part of a Kaggle competition.
from __future__ import print_function
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import itertools
import warnings
warnings.filterwarnings("ignore")
import os
import io
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly
import plotly.tools as tls
import plotly.figure_factory as ff
# Load the churn dataset (expects Churn_Modelling.csv in the working directory).
data = pd.read_csv('Churn_Modelling.csv')
# Peek at the first five rows to sanity-check the load.
data.head()
| RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619 | France | Female | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
| 1 | 2 | 15647311 | Hill | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
| 2 | 3 | 15619304 | Onio | 502 | France | Female | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
| 3 | 4 | 15701354 | Boni | 699 | France | Female | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
| 4 | 5 | 15737888 | Mitchell | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
# Converting all column headings to lowercase.
# A comprehension replaces the original index-based append loop; the result
# is identical but idiomatic and avoids the throwaway accumulator list.
data.columns = [col.lower() for col in data.columns]
# Dropping identifier columns that carry no predictive signal.
data = data.drop(["rownumber", "customerid", "surname"], axis = 1)
# Split the frame into churned vs. retained customers.
churn = data[data["exited"] == 1]
not_churn = data[data["exited"] == 0]

# Partition features into categorical and numerical groups.
# A column is treated as categorical when it has fewer than 6 distinct values.
target_col = ["exited"]
unique_counts = data.nunique()
cat_cols = [c for c in unique_counts[unique_counts < 6].keys().tolist()
            if c not in target_col]
num_cols = [c for c in data.columns if c not in cat_cols + target_col]
cat_cols
['geography', 'gender', 'numofproducts', 'hascrcard', 'isactivemember']
num_cols
['creditscore', 'age', 'tenure', 'balance', 'estimatedsalary']
data.head()
| creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619 | France | Female | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
| 1 | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
| 2 | 502 | France | Female | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
| 3 | 699 | France | Female | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
| 4 | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
# Viewing dimension of the dataset
data.shape
(10000, 11)
# Checking for unique value in the data attributes
data.nunique()
creditscore 460 geography 3 gender 2 age 70 tenure 11 balance 6382 numofproducts 4 hascrcard 2 isactivemember 2 estimatedsalary 9999 exited 2 dtype: int64
# Describing the all statistical properties of dataset
data[data.columns[:10]].describe()
| creditscore | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | |
|---|---|---|---|---|---|---|---|---|
| count | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.00000 | 10000.000000 | 10000.000000 |
| mean | 650.528800 | 38.921800 | 5.012800 | 76485.889288 | 1.530200 | 0.70550 | 0.515100 | 100090.239881 |
| std | 96.653299 | 10.487806 | 2.892174 | 62397.405202 | 0.581654 | 0.45584 | 0.499797 | 57510.492818 |
| min | 350.000000 | 18.000000 | 0.000000 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 11.580000 |
| 25% | 584.000000 | 32.000000 | 3.000000 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 51002.110000 |
| 50% | 652.000000 | 37.000000 | 5.000000 | 97198.540000 | 1.000000 | 1.00000 | 1.000000 | 100193.915000 |
| 75% | 718.000000 | 44.000000 | 7.000000 | 127644.240000 | 2.000000 | 1.00000 | 1.000000 | 149388.247500 |
| max | 850.000000 | 92.000000 | 10.000000 | 250898.090000 | 4.000000 | 1.00000 | 1.000000 | 199992.480000 |
# Percentage share of each target label (0 = retained, 1 = churned).
percentage_labels = data['exited'].value_counts(normalize = True).mul(100)
percentage_labels
0 79.63 1 20.37 Name: exited, dtype: float64
# Visualise the target label distribution, annotating each bar with its
# percentage of the whole dataset.
total_len = len(data['exited'])
sns.set()
sns.countplot(data.exited).set_title('Data Distribution')
ax = plt.gca()
for patch in ax.patches:
    bar_height = patch.get_height()
    bar_center = patch.get_x() + patch.get_width() / 2.
    share = 100 * (bar_height / total_len)
    ax.text(bar_center, bar_height + 2,
            '{:.2f}%'.format(share),
            fontsize=14, ha='center', va='bottom')
sns.set(font_scale=1.5)
ax.set_xlabel("Labels for exited column")
ax.set_ylabel("Numbers of records")
plt.show()
From this chart, one can see that 20% of the customers represented by the dataframe have churned and 80% of the customers have not churned.
# Function for visualising customer attrition across a categorical column.
def plot_pie(column):
    """Draw side-by-side donut charts of *column* for churn vs. non-churn customers.

    Uses the module-level ``churn`` / ``not_churn`` frames; renders inline
    via plotly offline mode.
    """
    # Shared styling for both pies; local name avoids shadowing the global frame.
    marker_style = dict(line = dict(width = 2, color = "rgb(243,243,243)"))
    churn_counts = churn[column].value_counts()
    retained_counts = not_churn[column].value_counts()
    left_pie = go.Pie(values = churn_counts.values.tolist(),
                      labels = churn_counts.keys().tolist(),
                      hoverinfo = "label+percent+name",
                      domain = dict(x = [0, .48]),
                      name = "Churn Customers",
                      marker = marker_style,
                      hole = .6)
    right_pie = go.Pie(values = retained_counts.values.tolist(),
                       labels = retained_counts.keys().tolist(),
                       hoverinfo = "label+percent+name",
                       marker = marker_style,
                       domain = dict(x = [.52, 1]),
                       hole = .6,
                       name = "Non churn customers")
    layout = go.Layout(dict(title = column + " distribution in customer attrition ",
                            plot_bgcolor = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            annotations = [dict(text = "churn customers",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .15, y = .5),
                                           dict(text = "Non churn customers",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .88, y = .5)]))
    fig = go.Figure(data = [left_pie, right_pie], layout = layout)
    plotly.offline.iplot(fig)
# Visualising geography
plot_pie(cat_cols[0])
The graph above shows that, among the churned customers, those geographically located in Germany have the highest churn rate at 40%, followed by France at 39.8% and Spain at 20.3%. Among non-churn customers, France leads with 52.8%, followed by Spain with 25.9% and Germany with 21.3%.
# Visualising gender
plot_pie(cat_cols[1])
The output above shows that, among the churned customers, 55.9% are female and 44.1% are male. Among the non-churn customers, 57.3% are male and 42.7% are female.
# Visualising numofproducts
plot_pie(cat_cols[2])
The graph above shows that among the churn customers, the rate of those who use one product is very high with $69.2\%$, followed by those who use two products with $17.1\%$, three products with $10.8\%$, and four products with $2.95\%$. For non churn customers, customers with two products are $53.3\%$, one product are $46.2\%$, and three products are $0.58\%$.
# Visualising hascrcard
plot_pie(cat_cols[3])
The output above shows us that for the churn customers those who possess a card are 69.9%, whereas those don't possess are 30.1%. For the case of non churn customers 70.7% possess a card and $29.3\%$ don't possess a card.
# Visualising isactivemember
plot_pie(cat_cols[4])
The output above shows that, among the churned customers, those who are not active members churn at a high rate of 63.9%, versus 36.1% for active members. Among the non-churn customers, active members lead with 55.5%, and non-active members account for 44.5%.
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()
# Function for visualising a numerical column as overlaid percentage histograms.
def histogram(column):
    """Plot percent-normalised histograms of *column* for churn vs. non-churn customers.

    Uses the module-level ``churn`` / ``not_churn`` frames; renders inline
    via plotly offline mode.
    """
    bar_outline = dict(line = dict(width = .5, color = "black"))
    churn_hist = go.Histogram(x = churn[column],
                              histnorm = "percent",
                              name = "Churn Customers",
                              marker = bar_outline,
                              opacity = .9)
    retained_hist = go.Histogram(x = not_churn[column],
                                 histnorm = "percent",
                                 name = "Non churn customers",
                                 marker = bar_outline,
                                 opacity = .9)
    # Identical grid styling for both axes, built once.
    axis_style = dict(gridcolor = 'rgb(255, 255, 255)',
                      zerolinewidth = 1,
                      ticklen = 5,
                      gridwidth = 2)
    layout = go.Layout(dict(title = column + " distribution in customer attrition ",
                            plot_bgcolor = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(title = column, **axis_style),
                            yaxis = dict(title = "percent", **axis_style)))
    fig = go.Figure(data = [churn_hist, retained_hist], layout = layout)
    plotly.offline.iplot(fig)
# Visualising creditscore
histogram(num_cols[0])
# Visualising age
histogram(num_cols[1])
The graph above shows us that the customers with age of 46 are the most churned.
# Visualising tenure
histogram(num_cols[2])
# Visualising balance
histogram(num_cols[3])
# Visualising estimatedsalary
histogram(num_cols[4])
data.isnull().sum()
creditscore 0 geography 0 gender 0 age 0 tenure 0 balance 0 numofproducts 0 hascrcard 0 isactivemember 0 estimatedsalary 0 exited 0 dtype: int64
# Producing correlation matrix
data[data.columns].corr()
| creditscore | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|
| creditscore | 1.000000 | -0.003965 | 0.000842 | 0.006268 | 0.012238 | -0.005458 | 0.025651 | -0.001384 | -0.027094 |
| age | -0.003965 | 1.000000 | -0.009997 | 0.028308 | -0.030680 | -0.011721 | 0.085472 | -0.007201 | 0.285323 |
| tenure | 0.000842 | -0.009997 | 1.000000 | -0.012254 | 0.013444 | 0.022583 | -0.028362 | 0.007784 | -0.014001 |
| balance | 0.006268 | 0.028308 | -0.012254 | 1.000000 | -0.304180 | -0.014858 | -0.010084 | 0.012797 | 0.118533 |
| numofproducts | 0.012238 | -0.030680 | 0.013444 | -0.304180 | 1.000000 | 0.003183 | 0.009612 | 0.014204 | -0.047820 |
| hascrcard | -0.005458 | -0.011721 | 0.022583 | -0.014858 | 0.003183 | 1.000000 | -0.011866 | -0.009933 | -0.007138 |
| isactivemember | 0.025651 | 0.085472 | -0.028362 | -0.010084 | 0.009612 | -0.011866 | 1.000000 | -0.011421 | -0.156128 |
| estimatedsalary | -0.001384 | -0.007201 | 0.007784 | 0.012797 | 0.014204 | -0.009933 | -0.011421 | 1.000000 | 0.012097 |
| exited | -0.027094 | 0.285323 | -0.014001 | 0.118533 | -0.047820 | -0.007138 | -0.156128 | 0.012097 | 1.000000 |
# Visualise the correlation matrix as an annotated heatmap.
sns.set()
sns.set(font_scale = 1.25)
corr_matrix = data[data.columns[:10]].corr()
sns.heatmap(corr_matrix, annot = True, fmt = ".1f")
plt.show()
# Box-plot generation for visual outlier inspection.
def gen_boxplot(df):
    """Return a list of plotly Box traces, one per column of *df*.

    Returning a fresh list (instead of appending to a module-level ``trace``
    accumulator, as the original did) keeps repeated calls independent —
    previously every call grew the same global list, so re-running the cell
    stacked duplicate traces into the figure.
    """
    return [go.Box(name = feature, y = df[feature]) for feature in df]

# Build traces for the first numerical column and render them.
new_df = data[num_cols[:1]]
data_tr = gen_boxplot(new_df)
py.iplot(data_tr)
# Handling age column outliers: ages above 85 are replaced with the column
# median.  The median is computed once, over the full column (outliers
# included), which matches the original loop's result — the original
# recomputed data.age.median() on every outlier iteration for nothing.
median_age = data.age.median()
data.age = data.age.where(data.age <= 85, median_age)
# One-hot encoding categorical attributes (adds geography_* / gender_* dummies).
list_cat = ['geography', 'gender']
data = pd.get_dummies(data, columns = list_cat, prefix = list_cat)
data.head()
| creditscore | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | geography_France | geography_Germany | geography_Spain | gender_Female | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619 | 42.0 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 | 1 | 0 | 0 | 1 | 0 |
| 1 | 608 | 41.0 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 | 0 | 0 | 1 | 1 | 0 |
| 2 | 502 | 42.0 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 | 1 | 0 | 0 | 1 | 0 |
| 3 | 699 | 39.0 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 | 1 | 0 | 0 | 1 | 0 |
| 4 | 850 | 43.0 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 | 0 | 0 | 1 | 1 | 0 |
from sklearn.ensemble import RandomForestClassifier
# Extracting features and label columns.
X = data.drop('exited', axis=1)
y = data.exited
features_label = X.columns
# Fit a forest purely to rank feature importances.
forest = RandomForestClassifier (n_estimators = 10000, random_state = 0, n_jobs = -1)
forest.fit(X, y)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
# BUG FIX: pair each importance with the feature at the SAME sorted index.
# The original printed features_label[i] (original column order) next to
# importances[indices[i]] (descending-importance order), mislabelling every
# row of the ranking.
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, features_label[idx], importances[idx]))
1) creditscore 0.239871 2) age 0.146924 3) tenure 0.144463 4) balance 0.141905 5) numofproducts 0.128252 6) hascrcard 0.083890 7) isactivemember 0.039927 8) estimatedsalary 0.020739 9) geography_France 0.018099 10) geography_Germany 0.009935 11) geography_Spain 0.008867 12) gender_Female 0.008583 13) gender_Male 0.008545
# Visualising feature importances, sorted in decreasing order.
plt.title('Feature Importances')
plt.bar(range(X.shape[1]), importances[indices], color = "blue", align = "center")
# BUG FIX: tick labels must follow the same sorted order as the bars —
# the original passed the unsorted features_label, mislabelling every bar.
plt.xticks(range(X.shape[1]), features_label[indices], rotation = 90)
plt.show()
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
# Splitting the dataset into train and test sets.
# random_state pins the split so the reported metrics are reproducible;
# stratify preserves the ~80/20 class ratio in both splits, which matters
# for this imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20,
                                                    random_state = 0, stratify = y)
# Initialising and fitting KNN.
# NOTE(review): every argument below is the sklearn default, so this is
# equivalent to KNeighborsClassifier(); distance-based models like KNN
# usually need feature scaling, which is not applied here — likely why
# its ROC-AUC is near chance.
knMod = KNeighborsClassifier(n_neighbors = 5, weights = 'uniform', algorithm = 'auto', leaf_size = 30, p = 2,
                             metric = 'minkowski', metric_params = None)
knMod.fit(X_train, y_train)
KNeighborsClassifier()
# Initialising and fitting Logistic Regression.
# solver='liblinear' with multi_class='ovr' suits this binary problem;
# verbose=2 is what prints "[LibLinear]" on every fit (including each
# cross-validation fold later on).
lrMod = LogisticRegression(penalty = 'l2', dual = False, tol = 0.0001, C = 1.0, fit_intercept = True,
                           intercept_scaling = 1, class_weight = None,
                           random_state = None, solver = 'liblinear', max_iter = 100,
                           multi_class = 'ovr', verbose = 2)
lrMod.fit(X_train, y_train)
[LibLinear]
LogisticRegression(multi_class='ovr', solver='liblinear', verbose=2)
# Initialising and fitting AdaBoost.
# base_estimator=None selects the default weak learner (a depth-1 decision stump).
adaMod = AdaBoostClassifier(base_estimator = None, n_estimators = 200, learning_rate = 1.0)
adaMod.fit(X_train, y_train)
AdaBoostClassifier(n_estimators=200)
# Initialising and fitting GradientBoosting.
# NOTE(review): loss='deviance' was renamed 'log_loss' in scikit-learn 1.1
# and removed in 1.3 — confirm the pinned sklearn version before upgrading.
gbMod = GradientBoostingClassifier(loss = 'deviance', n_estimators = 200)
gbMod.fit(X_train, y_train)
GradientBoostingClassifier(n_estimators=200)
# Initialising and fitting Random Forest (10 trees, gini impurity).
# NOTE(review): no random_state, so results vary between runs.
rfMod = RandomForestClassifier(n_estimators=10, criterion='gini')
rfMod.fit(X_train, y_train)
RandomForestClassifier(n_estimators=10)
# Function for evaluating fitted models on the held-out test set.
def modtest(models, names):
    """Score each fitted model on the module-level X_test / y_test.

    Returns a DataFrame indexed by *names* with 'accuracy' and 'roc_auc'
    columns, one row per model.
    """
    rows = []
    for model in models:
        acc = model.score(X_test, y_test)
        churn_probs = model.predict_proba(np.array(X_test.values))[:, 1]
        auc = roc_auc_score(y_test, churn_probs, average = 'macro', sample_weight = None)
        rows.append({'accuracy': acc, 'roc_auc': auc})
    return pd.DataFrame(rows, index = names)
# Testing all five base models on the held-out test set.
model_names = ['KNN', 'Logistic Regression', 'AdaBoost', 'GradientBoosting', 'Random Forest']
model_base = [knMod, lrMod, adaMod, gbMod, rfMod]
mod_performance = modtest(model_base, model_names)
# Displaying each model's accuracy and ROC-AUC.
mod_performance
| accuracy | roc_auc | |
|---|---|---|
| KNN | 0.7650 | 0.527902 |
| Logistic Regression | 0.7885 | 0.646722 |
| AdaBoost | 0.8480 | 0.849538 |
| GradientBoosting | 0.8630 | 0.860326 |
| Random Forest | 0.8490 | 0.822468 |
# Recreating models' performance data: FPR/TPR pairs for each fitted model,
# computed from the predicted churn probabilities on the test set.
fpr_knn, tpr_knn, _ = roc_curve(y_test, knMod.predict_proba(np.array(X_test.values))[:,1])
fpr_lr, tpr_lr, _ = roc_curve(y_test, lrMod.predict_proba(np.array(X_test.values))[:,1])
fpr_ada, tpr_ada, _ = roc_curve(y_test, adaMod.predict_proba(np.array(X_test.values))[:,1])
fpr_gb, tpr_gb, _ = roc_curve(y_test, gbMod.predict_proba(np.array(X_test.values))[:,1])
fpr_rf, tpr_rf, _ = roc_curve(y_test, rfMod.predict_proba(np.array(X_test.values))[:,1])
# Plotting the ROC curves; legend labels pull the AUC scores from
# mod_performance, whose row order matches the curves above.
plt.figure(figsize = (12,6), linewidth= 1)
plt.plot(fpr_knn, tpr_knn, label = 'KNN Score: ' + str(round(mod_performance.roc_auc[0], 5)))
plt.plot(fpr_lr, tpr_lr, label = 'LR score: ' + str(round(mod_performance.roc_auc[1], 5)))
plt.plot(fpr_ada, tpr_ada, label = 'AdaBoost Score: ' + str(round(mod_performance.roc_auc[2], 5)))
plt.plot(fpr_gb, tpr_gb, label = 'GB Score: ' + str(round(mod_performance.roc_auc[3], 5)))
plt.plot(fpr_rf, tpr_rf, label = 'RF score: ' + str(round(mod_performance.roc_auc[4], 5)))
# Diagonal = performance of random guessing.
plt.plot([0,1], [0,1], 'k--', label = 'Random guessing: 0.5')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve ')
plt.legend(loc='best')
plt.show()
from sklearn.model_selection import cross_val_score
# Function for computing the cross-validated mean and standard deviation
# of a scoring metric for each model.
def crosval(models, scr, X_train = X, y_train = y, cv = 5):
    """Return {model class name: [mean score, score std]} over *cv* folds.

    Note the defaults: unless X_train/y_train are passed explicitly, the
    folds are drawn from the FULL dataset (X, y), not the train split.
    """
    cvdict = {}
    for model in models:
        fold_scores = cross_val_score(model, X_train, y_train, cv = cv, scoring = scr)
        model_name = str(model).split('(')[0]
        cvdict[model_name] = [fold_scores.mean(), fold_scores.std()]
    return cvdict
# Performing 5-fold cross validation of the base models (roc_auc metric,
# over the full X, y — crosval's defaults).
cvdict = crosval(model_base, scr = 'roc_auc')
cvdict
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]
{'KNeighborsClassifier': [0.532941795209413, 0.015143901787688084],
'LogisticRegression': [0.6748251860125098, 0.01015424882762671],
'AdaBoostClassifier': [0.8439111429894497, 0.0050234645868154765],
'GradientBoostingClassifier': [0.8645775908153036, 0.00414460511418036],
'RandomForestClassifier': [0.8127473700025047, 0.0070867467315837]}
The dictionary summarises each model's cross-validated score: the first number in each list is the mean and the second is the standard deviation across the folds.
For hyperparameter tuning, we are choosing the top two performing models — AdaBoost and GradientBoosting.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
# Choosing parameters: candidate tree counts for AdaBoost.
adaHyperParams = {'n_estimators': [10,50,100,200,420]}
# Hyperparameter tuning; n_iter=5 covers the whole 5-value grid, so this
# behaves like an exhaustive grid search over n_estimators.
randSearchAda = RandomizedSearchCV(estimator = adaMod, param_distributions = adaHyperParams, n_iter = 5,
                                   scoring = 'roc_auc')
randSearchAda.fit(X_train, y_train)
RandomizedSearchCV(estimator=AdaBoostClassifier(n_estimators=200), n_iter=5,
param_distributions={'n_estimators': [10, 50, 100, 200,
420]},
scoring='roc_auc')
# Best parameters and the corresponding cross-validated roc_auc score.
randSearchAda.best_params_, randSearchAda.best_score_
({'n_estimators': 100}, 0.8438014418928169)
The output above shows the optimal number of estimators found by the search, together with the corresponding cross-validated roc_auc score.
# Choosing parameters: distributions sampled by the randomised search.
gbHyperParams = {'loss' : ['deviance', 'exponential'],
                 'n_estimators': randint(10, 500),
                 'max_depth': randint(1,10)}
# Hyperparameter tuning with randomized search (10 sampled candidates).
# NOTE(review): no random_state is set, so best_params_ varies between runs.
randSearchGB = RandomizedSearchCV(estimator = gbMod, param_distributions = gbHyperParams, n_iter = 10,
                                  scoring = 'roc_auc')
randSearchGB.fit(X_train, y_train)
RandomizedSearchCV(estimator=GradientBoostingClassifier(n_estimators=200),
param_distributions={'loss': ['deviance', 'exponential'],
'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd2bf274640>,
'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd2befd2460>},
scoring='roc_auc')
# Best parameters and the corresponding cross-validated roc_auc score.
randSearchGB.best_params_, randSearchGB.best_score_
({'loss': 'exponential', 'max_depth': 2, 'n_estimators': 459},
0.8614835205235138)
# GradientBoosting refit with the optimal parameters.
# NOTE(review): .fit() returns the estimator itself, so bestGbModFitted
# ALIASES randSearchGB.best_estimator_ — any later refit of one changes
# the other.
bestGbModFitted = randSearchGB.best_estimator_.fit(X_train, y_train)
# AdaBoost refit with the optimal parameter (same aliasing applies).
bestAdaModFitted = randSearchAda.best_estimator_.fit(X_train, y_train)
models = [bestGbModFitted, bestAdaModFitted]
# Cross-validate the tuned models (on the full X, y — crosval's defaults).
cvbestpara = crosval(models, scr = 'roc_auc')
cvbestpara
{'GradientBoostingClassifier': [0.861036705098787, 0.003659747141184775],
'AdaBoostClassifier': [0.8475364341076641, 0.003971801529982983]}
# Test-set ROC-AUC for the tuned GradientBoosting model.
test_labels = bestGbModFitted.predict_proba(np.array(X_test.values))[:,1]
roc_auc_score(y_test,test_labels , average = 'macro', sample_weight = None)
0.8614844633708163
# Test-set ROC-AUC for the tuned AdaBoost model.
test_labels = bestAdaModFitted.predict_proba(np.array(X_test.values))[:,1]
roc_auc_score(y_test,test_labels , average = 'macro', sample_weight = None)
0.8516020542103448
from sklearn.preprocessing import FunctionTransformer, StandardScaler
# Applying standard scaling to the train dataset.
# The original also created a FunctionTransformer(np.log1p) that was never
# applied to any data; that dead assignment has been removed.
scaler = StandardScaler()
X_train_transform = scaler.fit_transform(np.array(X_train))
# Fitting the tuned models to the scaled train data.
# NOTE(review): this refits the same best_estimator_ objects in place, so
# bestGbModFitted / bestAdaModFitted now also point at the scaled-data fit.
bestGbModFitted_transformed = randSearchGB.best_estimator_.fit(X_train_transform, y_train)
bestAdaModFitted_transformed = randSearchAda.best_estimator_.fit(X_train_transform, y_train)
# Cross validating the scaled-data models.
# BUG FIX: pass the scaled matrix explicitly — crosval's defaults are the
# raw, unscaled X and y, so the original call silently cross-validated on
# unscaled data (which is why its printed scores matched the unscaled run).
cvbestpara_transform = crosval(models = [bestGbModFitted_transformed, bestAdaModFitted_transformed],
                               scr = 'roc_auc',
                               X_train = X_train_transform, y_train = y_train)
cvbestpara_transform
{'GradientBoostingClassifier': [0.8610370135736718, 0.003659149252290059],
'AdaBoostClassifier': [0.8475364341076641, 0.003971801529982983]}
# Transforming the test dataset.
# BUG FIX: use transform(), not fit_transform() — refitting the scaler on
# the test set leaks test statistics and applies a different scaling than
# the one the model was trained with.
X_test_transform = scaler.transform(np.array(X_test))
# Predicting and evaluating model performance on the scaled test set.
test_labels = bestGbModFitted_transformed.predict_proba(np.array(X_test_transform))[:,1]
roc_auc_score(y_test,test_labels , average = 'macro', sample_weight = None)
0.8598542817220042
from sklearn.ensemble import VotingClassifier
# Initialising and fitting a soft-voting ensemble on the scaled data.
# voting='soft' averages predicted probabilities; weights=[2,1] counts the
# GradientBoosting probabilities twice as much as AdaBoost's.
votingMod = VotingClassifier(estimators=[('gb', bestGbModFitted_transformed),
                                         ('ada', bestAdaModFitted_transformed)],
                             voting = 'soft', weights = [2,1])
votingMod = votingMod.fit(X_train_transform, y_train)
# Predicting churn probabilities and test accuracy on the scaled test set.
test_labels = votingMod.predict_proba(np.array(X_test_transform))[:,1]
votingMod.score(X_test_transform, y_test)
0.8605
# The roc_auc score of the voting ensemble on the scaled test set.
roc_auc_score(y_test, test_labels , average = 'macro', sample_weight = None)
0.859963162128975
# Fitting the voting model to the unscaled data.
# NOTE(review): .fit() returns the estimator itself, so votingMod_old and
# votingMod are the SAME object — this refit overwrites the scaled-data fit
# above.
votingMod_old = votingMod.fit(X_train, y_train)
# Predicting on the raw (unscaled) test set.
test_labels = votingMod_old.predict_proba(np.array(X_test.values))[:,1]
votingMod.score(X_test, y_test)
0.8585
# The roc_auc score of the voting ensemble on the unscaled test set.
roc_auc_score(y_test,test_labels , average = 'macro', sample_weight = None)
0.8616371983861505
With transformed (scaled) data, the voting-based ensemble achieves slightly higher test accuracy (0.8605 vs. 0.8585), while its roc_auc score is marginally higher on the unscaled data.
In this project I built a model that predicts how likely a customer is to churn. During exploratory data analysis I found that female customers, customers located in Germany, and customers using only one product are the most likely to churn. After building, training and evaluating various models, I chose the best performing models, GradientBoosting and AdaBoost, to further improve prediction quality through hyperparameter tuning, cross validation and ensembling. Since the problem was a binary classification task on an imbalanced dataset, I chose the ROC-AUC metric to evaluate the models' performance, which reached about 86%; the best model's accuracy was similar. To further improve the model's performance, gathering more training data would be a natural next step.